In [1]:
import numpy
import wqio
import pynsqd
import pycvc
def get_cvc_parameter(nsqdparam, poc_dicts=None):
    """Translate an NSQD parameter name into the matching CVC parameter name.

    Parameters
    ----------
    nsqdparam : object
        Value compared (with ``==``) against the ``'nsqdname'`` entry of each
        mapping.
    poc_dicts : iterable of dict, optional
        Mappings that each provide ``'nsqdname'`` and ``'cvcname'`` keys.
        When omitted, ``pycvc.info.POC_dicts`` is used.

    Returns
    -------
    object
        The ``'cvcname'`` value of the first mapping whose ``'nsqdname'``
        equals `nsqdparam`, or ``numpy.nan`` when no mapping matches.
    """
    if poc_dicts is None:
        poc_dicts = pycvc.info.POC_dicts

    # Lazily scan for the first match; ``next`` supplies the NaN fallback
    # without building an intermediate list or catching IndexError.
    matches = (
        poc['cvcname']
        for poc in poc_dicts
        if poc['nsqdname'] == nsqdparam
    )
    return next(matches, numpy.nan)
def fix_nsqd_bacteria_units(df, unitscol='units'):
    """Relabel ``'MPN/100 mL'`` as ``'CFU/100 mL'`` in a units column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by `unitscol`.
    unitscol : str, optional
        Label of the column holding the unit strings (default ``'units'``).

    Returns
    -------
    pandas.DataFrame
        A copy of `df` in which every `unitscol` value equal to
        ``'MPN/100 mL'`` has been replaced by ``'CFU/100 mL'``. The input
        frame is left unmodified, so re-running a cell that calls this
        function cannot silently alter upstream data.
    """
    # Work on a copy: assigning into ``df`` directly would mutate the
    # caller's frame as a side effect.
    fixed = df.copy()
    fixed[unitscol] = fixed[unitscol].replace(
        to_replace='MPN/100 mL', value='CFU/100 mL'
    )
    return fixed
# NSQD-side names ('nsqdname') of every entry in pycvc.info.POC_dicts.
nsqd_params = [poc['nsqdname'] for poc in pycvc.info.POC_dicts]
In [2]:
# Grab the ``data`` attribute of a freshly constructed pynsqd.NSQData object.
raw_data = pynsqd.NSQData().data

# Filter the raw records, then swap the NSQD parameter names for their CVC
# counterparts and tag each row with a station label and a season.
clean_data = (
    raw_data
    # row filters: known land use, parameters listed in nsqd_params,
    # 'Total' fraction, EPA rain zone 1
    .query("primary_landuse != 'Unknown'")
    .query("parameter in @nsqd_params")
    .query("fraction == 'Total'")
    .query("epa_rain_zone == 1")
    # derived columns
    .assign(station='outflow')
    .assign(cvcparam=lambda nsqd: nsqd['parameter'].apply(get_cvc_parameter))
    .assign(season=lambda nsqd: nsqd['start_date'].apply(wqio.utils.getSeason))
    # replace the original 'parameter' column with the translated one
    .drop(labels='parameter', axis='columns')
    .rename(columns={'cvcparam': 'parameter'})
    .pipe(fix_nsqd_bacteria_units)
    # keep residential land use only
    .query("primary_landuse == 'Residential'")
)
In [3]:
# Row counts per parameter, with one column per season.
(
    clean_data
    .groupby(by=['parameter', 'season'])
    .size()
    .unstack(level='season')
)
Out[3]:
In [4]:
# Write only the 'Total Suspended Solids' rows to CSV, leaving out the index.
(
    clean_data
    .loc[lambda frame: frame['parameter'] == 'Total Suspended Solids']
    .to_csv('NSQD_Res_TSS.csv', index=False)
)